library(tidyverse)
library(skimr)
library(quanteda)
library(quanteda.textmodels)
library(jsonlite)
library(data.table)
library(stringr)
library(caret)
library(knitr)
library(e1071)
library(irlba)
library(randomForest)
library(doSNOW)
library(cleanNLP)

# Global knitr chunk options for the rendered document: echo code, suppress warnings.
knitr::opts_chunk$set(echo = TRUE, warning=FALSE)

Data prep

Data came in from a JSON file and was converted to a data frame on import. To clean it, I make the categorical variables into factors, add a unique review id, spread an array variable of helpful votes and total votes into two separate columns, and create a target variable — whether a review has any helpful votes — for the prediction section.

# Import the 5-core Tools & Home Improvement review dump (newline-delimited
# JSON) into a data.frame via jsonlite's streaming reader.
reviews_raw <- stream_in(file("reviews_Tools_and_Home_Improvement_5.json"))
## 
 Found 500 records...
 Found 1000 records...
 Found 1500 records...
 Found 2000 records...
 Found 2500 records...
 Found 3000 records...
 Found 3500 records...
 Found 4000 records...
 Found 4500 records...
 Found 5000 records...
 Found 5500 records...
 Found 6000 records...
 Found 6500 records...
 Found 7000 records...
 Found 7500 records...
 Found 8000 records...
 Found 8500 records...
 Found 9000 records...
 Found 9500 records...
 Found 10000 records...
 Found 10500 records...
 Found 11000 records...
 Found 11500 records...
 Found 12000 records...
 Found 12500 records...
 Found 13000 records...
 Found 13500 records...
 Found 14000 records...
 Found 14500 records...
 Found 15000 records...
 Found 15500 records...
 Found 16000 records...
 Found 16500 records...
 Found 17000 records...
 Found 17500 records...
 Found 18000 records...
 Found 18500 records...
 Found 19000 records...
 Found 19500 records...
 Found 20000 records...
 Found 20500 records...
 Found 21000 records...
 Found 21500 records...
 Found 22000 records...
 Found 22500 records...
 Found 23000 records...
 Found 23500 records...
 Found 24000 records...
 Found 24500 records...
 Found 25000 records...
 Found 25500 records...
 Found 26000 records...
 Found 26500 records...
 Found 27000 records...
 Found 27500 records...
 Found 28000 records...
 Found 28500 records...
 Found 29000 records...
 Found 29500 records...
 Found 30000 records...
 Found 30500 records...
 Found 31000 records...
 Found 31500 records...
 Found 32000 records...
 Found 32500 records...
 Found 33000 records...
 Found 33500 records...
 Found 34000 records...
 Found 34500 records...
 Found 35000 records...
 Found 35500 records...
 Found 36000 records...
 Found 36500 records...
 Found 37000 records...
 Found 37500 records...
 Found 38000 records...
 Found 38500 records...
 Found 39000 records...
 Found 39500 records...
 Found 40000 records...
 Found 40500 records...
 Found 41000 records...
 Found 41500 records...
 Found 42000 records...
 Found 42500 records...
 Found 43000 records...
 Found 43500 records...
 Found 44000 records...
 Found 44500 records...
 Found 45000 records...
 Found 45500 records...
 Found 46000 records...
 Found 46500 records...
 Found 47000 records...
 Found 47500 records...
 Found 48000 records...
 Found 48500 records...
 Found 49000 records...
 Found 49500 records...
 Found 50000 records...
 Found 50500 records...
 Found 51000 records...
 Found 51500 records...
 Found 52000 records...
 Found 52500 records...
 Found 53000 records...
 Found 53500 records...
 Found 54000 records...
 Found 54500 records...
 Found 55000 records...
 Found 55500 records...
 Found 56000 records...
 Found 56500 records...
 Found 57000 records...
 Found 57500 records...
 Found 58000 records...
 Found 58500 records...
 Found 59000 records...
 Found 59500 records...
 Found 60000 records...
 Found 60500 records...
 Found 61000 records...
 Found 61500 records...
 Found 62000 records...
 Found 62500 records...
 Found 63000 records...
 Found 63500 records...
 Found 64000 records...
 Found 64500 records...
 Found 65000 records...
 Found 65500 records...
 Found 66000 records...
 Found 66500 records...
 Found 67000 records...
 Found 67500 records...
 Found 68000 records...
 Found 68500 records...
 Found 69000 records...
 Found 69500 records...
 Found 70000 records...
 Found 70500 records...
 Found 71000 records...
 Found 71500 records...
 Found 72000 records...
 Found 72500 records...
 Found 73000 records...
 Found 73500 records...
 Found 74000 records...
 Found 74500 records...
 Found 75000 records...
 Found 75500 records...
 Found 76000 records...
 Found 76500 records...
 Found 77000 records...
 Found 77500 records...
 Found 78000 records...
 Found 78500 records...
 Found 79000 records...
 Found 79500 records...
 Found 80000 records...
 Found 80500 records...
 Found 81000 records...
 Found 81500 records...
 Found 82000 records...
 Found 82500 records...
 Found 83000 records...
 Found 83500 records...
 Found 84000 records...
 Found 84500 records...
 Found 85000 records...
 Found 85500 records...
 Found 86000 records...
 Found 86500 records...
 Found 87000 records...
 Found 87500 records...
 Found 88000 records...
 Found 88500 records...
 Found 89000 records...
 Found 89500 records...
 Found 90000 records...
 Found 90500 records...
 Found 91000 records...
 Found 91500 records...
 Found 92000 records...
 Found 92500 records...
 Found 93000 records...
 Found 93500 records...
 Found 94000 records...
 Found 94500 records...
 Found 95000 records...
 Found 95500 records...
 Found 96000 records...
 Found 96500 records...
 Found 97000 records...
 Found 97500 records...
 Found 98000 records...
 Found 98500 records...
 Found 99000 records...
 Found 99500 records...
 Found 1e+05 records...
 Found 100500 records...
 Found 101000 records...
 Found 101500 records...
 Found 102000 records...
 Found 102500 records...
 Found 103000 records...
 Found 103500 records...
 Found 104000 records...
 Found 104500 records...
 Found 105000 records...
 Found 105500 records...
 Found 106000 records...
 Found 106500 records...
 Found 107000 records...
 Found 107500 records...
 Found 108000 records...
 Found 108500 records...
 Found 109000 records...
 Found 109500 records...
 Found 110000 records...
 Found 110500 records...
 Found 111000 records...
 Found 111500 records...
 Found 112000 records...
 Found 112500 records...
 Found 113000 records...
 Found 113500 records...
 Found 114000 records...
 Found 114500 records...
 Found 115000 records...
 Found 115500 records...
 Found 116000 records...
 Found 116500 records...
 Found 117000 records...
 Found 117500 records...
 Found 118000 records...
 Found 118500 records...
 Found 119000 records...
 Found 119500 records...
 Found 120000 records...
 Found 120500 records...
 Found 121000 records...
 Found 121500 records...
 Found 122000 records...
 Found 122500 records...
 Found 123000 records...
 Found 123500 records...
 Found 124000 records...
 Found 124500 records...
 Found 125000 records...
 Found 125500 records...
 Found 126000 records...
 Found 126500 records...
 Found 127000 records...
 Found 127500 records...
 Found 128000 records...
 Found 128500 records...
 Found 129000 records...
 Found 129500 records...
 Found 130000 records...
 Found 130500 records...
 Found 131000 records...
 Found 131500 records...
 Found 132000 records...
 Found 132500 records...
 Found 133000 records...
 Found 133500 records...
 Found 134000 records...
 Found 134476 records...
 Imported 134476 records. Simplifying...
# Add a unique review_id, unnest the `helpful` list-column ([helpful votes,
# total votes]) into two separate columns, and rename them.
# Fixes vs. original: seq_len() instead of seq.int(); row_number() instead of
# seq_along(); superseded spread() replaced by pivot_wider(); explicit
# ungroup() so the review_id grouping does not silently persist downstream.
reviews <- reviews_raw %>% 
  select(-c(asin, reviewerName, summary, reviewTime)) %>% 
  mutate(
    review_id = seq_len(nrow(reviews_raw)),
    rating = factor(overall)
  ) %>%
  unnest(helpful) %>%                 # one row per element of the vote pair
  group_by(review_id) %>%
  mutate(col = row_number()) %>%      # 1 = helpful votes, 2 = total votes
  pivot_wider(names_from = col, values_from = helpful) %>%
  ungroup() %>%
  rename(
    helpful_votes = `1`,
    total_votes = `2`
  )  

# Engineer per-review features: the binary target (any helpful vote at all),
# simple text statistics, and calendar fields derived from the Unix timestamp.
reviews <- reviews %>% 
  mutate(
    isHelpful      = factor(as.integer(helpful_votes > 0)),
    word_count     = str_count(reviewText, pattern = "([A-Z]|[a-z])\\w+"),
    exclaim_count  = str_count(reviewText, pattern = "!"),
    question_count = str_count(reviewText, pattern = "\\?"),
    char_count     = str_length(reviewText),
    utc_time       = lubridate::as_datetime(unixReviewTime),
    time_month     = factor(
      lubridate::month(utc_time),
      ordered = TRUE,
      levels = as.character(1:12)
    )
  )


### 
# Note:  I was going to split observations on sentences, but ran out of time. Keeping it here for future reference
###
# number of words in all-caps
# reviews_split <- reviews %>%
#   mutate(
#     reviewText = str_split(reviewText, '[\\.][!][\\?]')
#   ) %>% 
#   unnest(reviewText).... DNF

Reduce the dataset

Some of the more computationally intense functions were running out of memory. I chose to reduce the dataset so I could focus on learning and practicing the concepts.

## My intention is for this notebook to be a learning exercise. The original
## dataset was running out of memory during various functions, so reducing the
## dataset arbitrarily from 134,476 to ~30,000 observations.

# Stratified sample of ~30,000 obs (createDataPartition preserves the
# isHelpful class balance).
# Fix: seed the RNG (345, as in the later partitions) so the sample is
# reproducible across knits.
set.seed(345)
reviews.reduced.indexes <- createDataPartition(reviews$isHelpful, times=1, p=(30000/nrow(reviews)), list=FALSE)
reduced.30 <- reviews[reviews.reduced.indexes,]

Exploratory Data Analysis

Checking for missing observations and plotting a few initial plots to get a feel for helpful and rating distribution.

# Check for missing data - no review text missing (empty result = every row
# of reviewText is complete)
which(!complete.cases(reviews$reviewText))
## integer(0)
# Review text length seems to follow a similar curve among all ratings.
ggplot(reviews, aes(str_length(reviewText), fill=rating)) + 
  geom_histogram(binwidth=10)  +
  xlim(0,4000) +
  ggtitle('Review length ~ Rating')

# Unhelpful reviews do follow a different trajectory than helpful reviews:
# there is a proportionally larger amount of helpful reviews with larger
# character length.
ggplot(reviews, aes(str_length(reviewText), fill=isHelpful)) + 
  geom_histogram(binwidth=10)  +
  xlim(0,4000) +
  ggtitle('Review length ~ Helpfulness')

# Similar finding with word count.
ggplot(reviews, aes(word_count, fill=isHelpful)) + 
  geom_histogram(binwidth=5)  +
  xlim(0,650) +
  ggtitle('Word count ~ Helpfulness')

# Exclamation marks per review, by helpfulness. (Title typo fixed.)
ggplot(reviews, aes(exclaim_count, fill=isHelpful)) + 
  geom_histogram(binwidth=1, position = 'dodge')  +
  xlim(0,10)  +
  ggtitle('Exclamation count ~ Helpfulness')

# Question marks per review, by helpfulness.
ggplot(reviews, aes(question_count, fill=isHelpful)) + 
  geom_histogram(binwidth=1, position = 'dodge')  +
  xlim(0,10) +
  ggtitle('Question mark count ~ Helpfulness')

# Most helpful reviews have under 10 votes, with a really long tail.
ggplot(reviews, aes(helpful_votes)) + 
  geom_histogram(binwidth=1)  +
  xlim(0,50) +
  ggtitle('Histogram of total helpful votes per review')

# While average characters-per-word counts are about the same, unhelpful
# reviews have a greater spread of outliers.
# FIX: use bare column names inside aes() — the original's reviews$char_count
# bypasses ggplot's data masking (breaks faceting and grouping semantics).
ggplot(reviews, aes(char_count / word_count, fill=isHelpful)) + 
  geom_boxplot() + 
  xlim(0,15)+
  ggtitle('Average word length')

# It appears fall reviews are more likely to be helpful. It would be
# interesting to see if a correlation exists between purchase amount within
# the category and the number of reviews marked helpful.
# FIX: geom_bar() is the idiomatic form for a discrete axis;
# geom_histogram(stat='count') produces the same plot with a warning.
ggplot(reviews, aes(time_month, fill=isHelpful)) + 
  geom_bar()  +
  ggtitle('Helpfulness ~ Month')

# Independence test: helpfulness vs. month.
chisq.test(reviews$isHelpful, reviews$time_month)
## 
##  Pearson's Chi-squared test
## 
## data:  reviews$isHelpful and reviews$time_month
## X-squared = 974.73, df = 11, p-value < 2.2e-16

Create corpus & dfm for further EDA

# Build a quanteda corpus of the review text and attach review-level metadata
# as document variables for grouping/targeting in later sections.
reviews.corpus <- corpus(reviews$reviewText)
docvars(reviews.corpus, "review_id") <- reviews$review_id
docvars(reviews.corpus, 'rating') <- reviews$rating
docvars(reviews.corpus, 'reviewer') <- reviews$reviewerID
docvars(reviews.corpus, 'helpful') <- reviews$helpful_votes
docvars(reviews.corpus, 'isHelpful') <- reviews$isHelpful
docvars(reviews.corpus, 'word_count') <- reviews$word_count



# summary(reviews.corpus)

# Document-feature matrix without English stopwords/punctuation/symbols, plus
# a tf-idf weighted copy.
# NOTE(review): dfm(x, remove=...) is deprecated in newer quanteda versions —
# the tokens() %>% tokens_remove() %>% dfm() pipeline is the current form;
# verify against the installed quanteda version before changing.
reviews.dfm <- dfm(reviews.corpus, remove=stopwords("english"), remove_punct=TRUE, remove_symbols=TRUE, remove_separators=TRUE)
reviews.tfidf <-dfm_tfidf(reviews.dfm)

Word Clouds for initial intuition

# Fixed seed so the word-cloud layout is reproducible.
set.seed(100)


# Word cloud of the most frequent terms in the full dfm.

# Nothing sticks out as too big or important here - the top 15/20 words are
# pretty generic. Interesting that "s" and "t" are common - I'm assuming these
# are fragments of possessives and contractions, the latter possibly
# indicating negativity.
textplot_wordcloud(reviews.dfm, min_count = 6, random_order = FALSE,
                   rotation = .25, 
                   color = RColorBrewer::brewer.pal(8,"Dark2"))

# Comparison word cloud, grouped on helpfulness.
# Reviews without a helpful vote use generic adjectives like great, light,
# easy. Reviews with at least one helpful vote reference parts of the tool or
# aspects of the product such as blade, edge, guide, lumens, fence, plate,
# motor, router, base. Hypothesis: helpful reviews tend to reference specific
# experiences with the product.

reviews.dfm_helpful <- dfm(reviews.corpus, groups='isHelpful', remove=stopwords("english"), remove_punct=TRUE, remove_numbers=TRUE, remove_separators=TRUE)

textplot_wordcloud(reviews.dfm_helpful, comparison = TRUE, max_words = 350, color = RColorBrewer::brewer.pal(8,"Dark2"))

# Comparison word cloud, grouped on rating.
# Nothing too surprising here - reviews with high ratings are associated with
# ease of use, bad reviews are associated with things breaking, returning, and
# being a waste of money.
reviews.dfm_ratings <- dfm(reviews.corpus, groups='rating', remove=stopwords("english"), remove_punct=TRUE, remove_numbers=TRUE, remove_separators=TRUE)

textplot_wordcloud(reviews.dfm_ratings, comparison = TRUE, max_words = 150)

## Keyness

# Words keyed to unhelpful reviews look like adjectives and those keyed to
# helpful reviews look like nouns. Will use POS tagging to explore this
# further.
reviews.keyness <- textstat_keyness(reviews.dfm, target=reviews$isHelpful==1)
textplot_keyness(reviews.keyness, margin = 0.2, n = 10)

Part of Speech tagging

# ~20,000 obs for POS tagging. (Fix: the original comment said "30,000" but
# p below targets 20,000 — confirmed by the downstream summary of ~20,001
# rows.) Seeded for reproducibility, consistent with the other partitions.
set.seed(345)
reviews.reduced.indexes <- createDataPartition(reviews$isHelpful, times=1, p=(20000/nrow(reviews)), list=FALSE)
reduced.30 <- reviews[reviews.reduced.indexes,]


## Trying a new dictionary for practice. This time will do percent of positive
## and negative equalling 1

start.time <- Sys.time()
cl <- makeCluster(3, type='SOCK')
registerDoSNOW(cl)


# Annotate the reduced text with universal POS tags via udpipe.
# NOTE(review): cnlp_annotate() does not appear to use the foreach backend
# registered above — the cluster may be idle here; verify before relying on
# it for speed.
cnlp_init_udpipe()
reviews.pos <- cnlp_annotate(reduced.30$reviewText)

stopCluster(cl)
total.time <- Sys.time() - start.time
total.time   # (duplicate print of total.time removed)
## Time difference of 30.60056 mins
# cnlp_annotate numbers documents 1..n in row order; mirror that id on the
# reduced data so the token table can be joined back to review-level columns.
reduced.30$doc_id <- seq_len(nrow(reduced.30))

pos.docs <- left_join(reviews.pos$token,reduced.30, by='doc_id') %>% 
  select(c(doc_id, token_with_ws, upos, xpos, feats, relation, reviewerID, rating, helpful_votes, isHelpful))


# Even though the dataset has proportionally more unhelpful reviews, helpful
# reviews contribute proportionally more identified POS tokens.
round(prop.table(table(reduced.30$isHelpful)), 3)
## 
##     0     1 
## 0.584 0.416
# Convert both POS tag columns to factors in one pass.
pos.docs <- pos.docs %>%
  mutate(across(c(upos, xpos), factor))

# Count of each universal POS tag, split by the target class.
pos.docs %>%
  ggplot(aes(upos, fill = isHelpful)) +
  geom_bar(position = "dodge2") +
  ggtitle("Count of POS ~ helpfulness")

## chi-squared table for expected values??? based on length?

# Raw counts of each universal POS tag by helpfulness class.
table(pos.docs$isHelpful, pos.docs$upos)
##    
##        ADJ    ADP    ADV    AUX  CCONJ    DET   INTJ   NOUN    NUM   PART
##   0  83142  85221  71441  66200  37792  99370   1409 190594  19308  32000
##   1 105273 117040  93614  88174  49624 140009   1826 266054  30880  41612
##    
##       PRON  PROPN  PUNCT  SCONJ    SYM   VERB      X
##   0 110129  13895 108044  20008   2360 126640    941
##   1 138858  24256 151407  28002   4521 167272   1303
# Any unexpected outliers? word_count vs. char_count should be roughly linear.
ggplot(reduced.30, aes(word_count, char_count))+
  geom_point()

# Overview of the reduced dataset's variables and distributions.
summary(reduced.30)
##   reviewerID         reviewText           overall      unixReviewTime     
##  Length:20001       Length:20001       Min.   :1.000   Min.   :9.431e+08  
##  Class :character   Class :character   1st Qu.:4.000   1st Qu.:1.329e+09  
##  Mode  :character   Mode  :character   Median :5.000   Median :1.366e+09  
##                                        Mean   :4.379   Mean   :1.344e+09  
##                                        3rd Qu.:5.000   3rd Qu.:1.388e+09  
##                                        Max.   :5.000   Max.   :1.406e+09  
##                                                                           
##    review_id      rating    helpful_votes      total_votes      isHelpful
##  Min.   :     4   1:  737   Min.   :  0.000   Min.   :  0.000   0:11681  
##  1st Qu.: 33251   2:  701   1st Qu.:  0.000   1st Qu.:  0.000   1: 8320  
##  Median : 67484   3: 1544   Median :  0.000   Median :  0.000            
##  Mean   : 67342   4: 4287   Mean   :  3.088   Mean   :  3.601            
##  3rd Qu.:101045   5:12732   3rd Qu.:  1.000   3rd Qu.:  2.000            
##  Max.   :134475             Max.   :976.000   Max.   :993.000            
##                                                                          
##    word_count     exclaim_count    question_count       char_count     
##  Min.   :   0.0   Min.   : 0.000   Min.   : 0.00000   Min.   :    0.0  
##  1st Qu.:  34.0   1st Qu.: 0.000   1st Qu.: 0.00000   1st Qu.:  190.0  
##  Median :  62.0   Median : 0.000   Median : 0.00000   Median :  354.0  
##  Mean   : 103.2   Mean   : 0.349   Mean   : 0.09274   Mean   :  592.1  
##  3rd Qu.: 124.0   3rd Qu.: 0.000   3rd Qu.: 0.00000   3rd Qu.:  707.0  
##  Max.   :4635.0   Max.   :45.000   Max.   :18.00000   Max.   :26417.0  
##                                                                        
##     utc_time                     time_month       doc_id     
##  Min.   :1999-11-20 00:00:00   1      :2284   Min.   :    1  
##  1st Qu.:2012-02-11 00:00:00   12     :2144   1st Qu.: 5001  
##  Median :2013-04-17 00:00:00   3      :1844   Median :10001  
##  Mean   :2012-08-06 20:19:28   2      :1832   Mean   :10001  
##  3rd Qu.:2013-12-23 00:00:00   6      :1719   3rd Qu.:15001  
##  Max.   :2014-07-23 00:00:00   11     :1695   Max.   :20001  
##                                (Other):8483

Plot POS relationships

# Per-document counts of each universal POS tag, widened to one column per
# tag, so they can be joined onto the review-level data.
pos.totals <- pos.docs %>% 
  group_by(doc_id) %>% 
  count(upos) %>% 
  ungroup() %>%
  pivot_wider(names_from = upos, values_from=n, values_fill=0) %>% 
  rename(review_id = doc_id)

# FIX: left_join() takes `by`, not `on` — the original's `on=review_id` was
# silently swallowed and the key auto-detected.
reduced.30.pos <- left_join(reduced.30, pos.totals, by = "review_id")

# POS tag columns by name (robust against column-position drift).
pos_cols <- setdiff(names(pos.totals), "review_id")

# Ratio features per review. Note: the ratios are Inf/NaN when the
# denominator tag count is zero.
reduced.30.pos.vars <- reduced.30.pos %>% 
  mutate(
    noun_verb = NOUN/VERB,
    noun_adj = NOUN/ADJ,
    adj_verb = ADJ/VERB,
    noun_len = NOUN/word_count,
    verb_len = VERB/word_count,
    adj_len = ADJ/word_count,
    # FIX: the original sum(reduced.30.pos[16:32]) collapsed a positional
    # column range to a single grand total repeated on every row — and the
    # range itself was off by one (it included doc_id, col 16, and excluded
    # the last tag; the tag columns are 17:33). rowSums over the named tag
    # columns gives the intended per-review token total.
    word_total = rowSums(reduced.30.pos[pos_cols])
  )

# Distributions of the POS-ratio features, split by the target class.
ggplot(reduced.30.pos.vars, aes(noun_verb, fill = isHelpful)) +
  geom_histogram(binwidth = .1) +
  labs(title = "Histogram of Noun:Verb ratio per review")

ggplot(reduced.30.pos.vars, aes(noun_adj, fill = isHelpful)) +
  geom_histogram(binwidth = .5) +
  labs(title = "Histogram of Noun:Adjective ratio per review")

ggplot(reduced.30.pos.vars, aes(adj_verb, fill = isHelpful)) +
  geom_histogram(binwidth = .2) +
  labs(title = "Histogram of Adjective:Verb ratio per review")

ggplot(reduced.30.pos.vars, aes(noun_len, fill = isHelpful)) +
  geom_histogram(binwidth = .01) +
  xlim(0, 2.5) +
  labs(title = "Histogram of Noun:Review-Length Proportion per review")

ggplot(reduced.30.pos.vars, aes(verb_len, fill = isHelpful)) +
  geom_histogram(binwidth = .02) +
  xlim(0, 2) +
  labs(title = "Histogram of Verb:Review-Length Proportion per review")

ggplot(reduced.30.pos.vars, aes(adj_len, fill = isHelpful)) +
  geom_histogram(binwidth = .02) +
  xlim(0, 1) +
  labs(title = "Histogram of Adj::Review-Length Proportion per review")

# Peek at a column name by position (position 14 is utc_time here).
colnames(reduced.30.pos.vars[14])
## [1] "utc_time"
## [1] "utc_time"

Sentiment Analysis

Is sentiment generally on trend with ratings?

# Apply quanteda's Lexicoder (LSD2015) sentiment dictionary to the reduced set.
# Tokenize the review text (punctuation dropped).
reviews_tokens <- tokens(reduced.30$reviewText, remove_punct = TRUE)

# Map tokens onto the LSD2015 sentiment categories.
reviews_tokens_lsd <- tokens_lookup(reviews_tokens, dictionary =  data_dictionary_LSD2015)

# Proportion-weighted dfm: each review's category shares sum to 1, so long
# and short reviews are comparable.
dfm_lsd <- dfm(reviews_tokens_lsd)
dfm_lsd_weighted <- dfm_weight(dfm_lsd, scheme='prop')

# Convert to a joinable data.table.
dfm_lsd_weighted_df <- setDT(as.data.frame(dfm_lsd_weighted), keep.rownames='docs')


# Join the sentiment shares back onto the review data.
# FIX: tokens() names documents text1..textN by ROW POSITION within
# reduced.30, not by the original review_id. The original key
# paste0("text", review_id) therefore matched reviews against OTHER reviews'
# sentiment and silently dropped ~85% of rows in the inner join (only
# review_id <= nrow(reduced.30) could match at all). Key on row position.
reviews_sentiment <- reduced.30 %>% 
  ungroup() %>%
  mutate(
    docs = paste0("text", row_number())
  )  %>%
   inner_join(dfm_lsd_weighted_df, by=c('docs'='document'), copy=TRUE)


# Positive sentiment share by helpfulness.
# Helpful reviews skew less positive. Reviews that are purely cheery praise
# may not offer enough insight for someone to mark them helpful. This suggests
# Amazon could lead with summary ratings plus helpful reviews, and use
# progressive disclosure for the more generic ones.
reviews_sentiment %>%
  ggplot(aes(isHelpful, positive)) +
  geom_boxplot() +
  labs(title = 'Percent of Positive Sentiment in Review if they were helpful or not')

# Negative sentiment share shows the same pattern: helpful reviews sit in the
# middle, while strongly-sentimental reviews in either direction are not seen
# as helpful. Intuitively, reviewers at the extremes seem less likely to
# describe both pros and cons realistically.
reviews_sentiment %>%
  arrange(isHelpful) %>%
  ggplot(aes(isHelpful, negative)) +
  geom_boxplot() +
  labs(title = 'Percent of Negative Sentiment in Review  if they were helpful or not')

# Expected a stronger correlation here; this plot neither supports nor
# debunks that hypothesis.
reviews_sentiment %>%
  ggplot(aes(positive, negative)) +
  geom_point() +
  facet_wrap(~isHelpful) +
  labs(title = 'Percent of Positive and Negative Sentiment in Reviews, Faceted by Rating')

### This section was from when I was initially focused on ratings. IT's still interesting, so keeping it.

# # plot ratings by positive sentiment count 
# reviews_sentiment %>% 
#   mutate(
#     overall = factor(overall, ordered = TRUE, levels=c('1', '2', '3', '4', '5'))
#   ) %>% 
#   ggplot(aes(overall, positive)) + 
#   geom_boxplot()  +
#   ggtitle('Percent of Positive Sentiment in Review ~ Rating')
# 
# # plot ratings by negative sentiment count 
# reviews_sentiment %>% 
#   mutate(
#     overall = factor(overall, ordered = TRUE, levels=c('1', '2', '3', '4', '5'))
#   ) %>%
#   arrange(overall) %>% 
#   ggplot(aes(overall, negative)) + 
#   geom_boxplot() +
#   ggtitle('Percent of Negative Sentiment in Review ~ Rating')
# 
# reviews_sentiment %>% 
#   mutate(
#     overall = factor(overall, ordered = TRUE, levels=c('1', '2', '3', '4', '5'))
#   ) %>% 
#   ggplot(aes(positive, negative)) + 
#   geom_point() +
#   facet_wrap(~overall) +
#   ggtitle('Percent of Positive and Negative Sentiment in Reviews, Faceted by Rating')




# TODO: 

Does month affect sentiment?

# Monthly distribution of the positive sentiment share.
reviews_sentiment %>%
  ggplot(aes(time_month, positive)) +
  geom_boxplot() +
  labs(title = 'Percent of Positive Sentiment in Review ~ month')

# Monthly distribution of the negative sentiment share.
reviews_sentiment %>%
  ggplot(aes(time_month, negative)) +
  geom_boxplot() +
  labs(title = 'Percent of Negative Sentiment in Review ~ month')

Individual Reviewer Sentiment

### TODO: Do individual reviewers tend to be either positive or negative across all reviews?

# Per-reviewer aggregates: review count, rating mean/spread, and mean/spread
# of the positive & negative sentiment shares. (sd() is NA for reviewers with
# a single review.)
reviewer_stats <- reviews_sentiment %>%
  group_by(reviewerID) %>%
  summarise(
    total = n(),
    rating_avg = mean(overall),
    rating_std = sd(overall),
    sent_pos_avg = mean(positive),
    sent_pos_std = sd(positive),
    sent_neg_avg = mean(negative),
    sent_neg_std = sd(negative),
    .groups = "drop"
  )
summary(reviewer_stats)
##   reviewerID            total          rating_avg      rating_std    
##  Length:2393        Min.   : 1.000   Min.   :1.000   Min.   :0.0000  
##  Class :character   1st Qu.: 1.000   1st Qu.:4.000   1st Qu.:0.0000  
##  Mode  :character   Median : 1.000   Median :5.000   Median :0.4045  
##                     Mean   : 1.239   Mean   :4.376   Mean   :0.5355  
##                     3rd Qu.: 1.000   3rd Qu.:5.000   3rd Qu.:0.7071  
##                     Max.   :21.000   Max.   :5.000   Max.   :2.8284  
##                                                      NA's   :2024    
##   sent_pos_avg     sent_pos_std     sent_neg_avg     sent_neg_std   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.5238   1st Qu.:0.0955   1st Qu.:0.1000   1st Qu.:0.0943  
##  Median :0.7000   Median :0.1768   Median :0.2727   Median :0.1750  
##  Mean   :0.6864   Mean   :0.2027   Mean   :0.2874   Mean   :0.1924  
##  3rd Qu.:0.8750   3rd Qu.:0.2944   3rd Qu.:0.4286   3rd Qu.:0.2768  
##  Max.   :1.0000   Max.   :0.7071   Max.   :1.0000   Max.   :0.7071  
##                   NA's   :2024                      NA's   :2024
# Reviewer-level scatter: rating spread vs. rating average, colored by average
# positive sentiment.
# FIX: the three titles below were copy-pasted from the monthly-sentiment
# section ('Percent of Negative Sentiment in Review ~ month') and did not
# describe these plots; corrected to match what is drawn.
reviewer_stats %>%
  ggplot(aes(rating_avg, rating_std, color=sent_pos_avg)) +
  geom_point() +
  ggtitle('Rating spread ~ rating average, colored by positive sentiment')

# narrow down to reviewers with high average ratings
reviewer_stats %>%
  filter(rating_avg>4) %>% 
  ggplot(aes(rating_avg, rating_std, color=sent_pos_avg)) +
  geom_point() +
  ggtitle('Rating spread ~ rating average (avg > 4)')

# hypothesis: those who have a lower average rating but more positive reviews
# will have a larger standard deviation
reviewer_stats %>%
  filter(rating_avg<2.5) %>% 
  ggplot(aes(rating_avg, rating_std, color=sent_pos_avg, size=total)) +
  geom_point() +
  ggtitle('Rating spread ~ rating average (avg < 2.5), sized by review count')

# hypothesis: positive sentiment and rating standard deviation are correlated
# when rating average is low

low_ratings_sentiment <- reviewer_stats %>% 
  filter(rating_avg < 2)

# reviewer_stats_lm <- train(rating_avg~., reviewer_stats, method='lm')

Model Selection

Whether a review is helpful or not is important information for a product. Using a predictive model, we can surface new reviews we predict will be helpful to the top, so potential customers get answers faster or consider things they hadn’t. In this section, I’ll create train/test sets and compare several models’ ability to predict whether a review receives at least 1 helpful vote.

Create train/test sets

# Create the target variable first, to ensure stratification.
# Fixes vs. original: seq_len()/row_number(); pivot_wider() over superseded
# spread(); ungroup() before select() so review_id is actually dropped (the
# original emitted "Adding missing grouping variables: `review_id`" and kept
# the column); partition proportion computed from reviews.strat itself rather
# than the unrelated `reviews` object (same row count today, but fragile).
reviews.strat <- reviews_raw %>% 
  mutate(
    review_id = seq_len(nrow(reviews_raw)),
  ) %>%
  unnest(helpful) %>%
  group_by(review_id) %>%
  mutate(col = row_number()) %>%
  pivot_wider(names_from = col, values_from = helpful) %>% 
  rename(
    helpful_votes = `1`, 
    total_votes = `2`
  ) %>% 
  mutate(
    # stricter target than the EDA section: "helpful" = more than 5 votes
    isHelpful = factor(ifelse(helpful_votes>5,1,0))
  ) %>% 
  ungroup() %>%
  select(-c(reviewerID, reviewerName, asin, summary, review_id))

## Using an even more reduced set for POS tagging, SVM and RF training...
# ~4,000 observations for model training, stratified on the target
set.seed(345)
reviews.reduced.indexes <- createDataPartition(reviews.strat$isHelpful, times=1, p=(4000/nrow(reviews.strat)), list=FALSE)
reduced.4 <- reviews.strat[reviews.reduced.indexes,]


# Split the reduced set into train (70%) / test (30%), stratified on the
# target so both splits keep the same class balance.
set.seed(345)
indexes <- createDataPartition(reduced.4$isHelpful, times=1, p=.7, list=FALSE)

train <- reduced.4[indexes,]
test <- reduced.4[-indexes,] 

# Sanity check: the target distribution should match across full/train/test.
round(prop.table(table(reduced.4$isHelpful)), 3)
## 
##     0     1 
## 0.899 0.101
round(prop.table(table(train$isHelpful)), 3)
## 
##     0     1 
## 0.899 0.101
round(prop.table(table(test$isHelpful)), 3)
## 
##   0   1 
## 0.9 0.1
round((table(reduced.4$isHelpful)), 3)
## 
##    0    1 
## 3598  403
round((table(train$isHelpful)), 3)
## 
##    0    1 
## 2519  283
round((table(test$isHelpful)), 3)
## 
##    0    1 
## 1079  120
# Keep the labels aside, then drop the target and its source columns from the
# test set so a model can never see them.
test.labels <- test$isHelpful
train.labels <- train$isHelpful

test <- test %>% 
  select(-c(isHelpful, helpful_votes, total_votes))

Prep Test set for model training

####
# Future things to try:
# • convert numbers to words
# • if twitter, don't remove symbols
# • remove non-english words
# • cross-reference a translation dictionary, especially location-specific
# • potentially correct common misspellings
####

# Re-derive the engineered review-level features on the training split only:
# rating factor, calendar month, and simple text statistics.
train.pre <- train %>%
  mutate(
    rating = factor(overall),
    utc_time = lubridate::as_datetime(unixReviewTime),
    time_month = factor(lubridate::month(utc_time), ordered = TRUE,
                        levels = as.character(1:12)),
    word_count = str_count(reviewText, pattern = "([A-Z]|[a-z])\\w+"),
    exclaim_count = str_count(reviewText, pattern = "!"),
    question_count = str_count(reviewText, pattern = "\\?"),
    char_count = str_length(reviewText),
    # guard against divide-by-zero on empty reviews
    char_per_word = ifelse(word_count == 0, 0, (char_count) / (word_count))
  ) %>%
  select(-c(utc_time, unixReviewTime))



# Manually replace punctuation with explicit pseudo-tokens so the tokenizer
# keeps them as features.
## Kept for reference: initially used this, but found it hurt performance
## quite a bit.
# str_replace_all() applies the named patterns one after another, in order —
# the same sequential semantics as the original one-rule-per-line chain.
punct_tokens <- c(
  '\\.' = ' punct.period ',
  '\\,' = ' punct.comma ',
  '\\;' = ' punct.semicolon ',
  '\\:' = ' punct.colon ',
  '\\[' = ' punct.bracket ',
  '\\]' = ' punct.bracket ',
  '\\(' = ' punct.paren ',
  '\\)' = ' punct.paren ',
  '\\-' = ' punct.dash ',
  '\\—' = ' punct.dash ',
  '\\+' = ' punct.plus ',
  '\\=' = ' punct.equal ',
  '\\!' = ' punct.bang ',
  '\\@' = ' punct.at ',
  '\\#' = ' punct.hash ',
  '\\?' = ' punct.question ',
  '\\$' = ' punct.dollar ',
  '\\%' = ' punct.percent ',
  '\\&' = ' punct.amp ',
  '\\*' = ' punct.aster ',
  "\\'" = ' punct.quot ',
  '\\"' = ' punct.quot ',
  '\\`' = ' punct.quot '
)

train.punct <- train %>%
  mutate(reviewText = str_replace_all(reviewText, punct_tokens))



# Tokenize the training text for model training: bare words with numbers,
# symbols, punctuation and separators removed, and hyphenated words split.
train.tokens <- tokens(train$reviewText, 
                       what = "word",
                       remove_numbers = TRUE,
                       remove_symbols = TRUE, 
                       remove_punct = TRUE,
                       split_hyphens = TRUE,
                       remove_separators = TRUE
                       )

# Lowercase, drop 'smart' stopwords, stem, then build the document-feature
# matrix.
train.tokens.dfm <- train.tokens %>% 
  tokens_tolower() %>% 
  tokens_remove(stopwords(source = 'smart')) %>% 
  tokens_wordstem(language = 'english') %>% 
  dfm()

# Trim rare terms (memory/speed) and apply tf-idf weighting.
train.tfidf <- train.tokens.dfm %>% 
  dfm_trim(min_termfreq = 10, min_docfreq = 2) %>% 
  dfm_tfidf()

# Label + features as a plain data.frame; make.names() sanitizes term columns
# that aren't valid R names.
# NOTE(review): data.frame() on a dfm is deprecated in newer quanteda —
# convert(x, to='data.frame') (used three lines below) is the supported path,
# though it adds a doc_id column; verify before switching.
train.tfidf.df <- cbind(Label=train$isHelpful, data.frame(train.tfidf))
names(train.tfidf.df) <- make.names(names(train.tfidf.df))


# Dense matrix + labeled data.frame versions of the unweighted dfm.
train.tokens.matrix <- as.matrix(train.tokens.dfm)
train.tokens.df <- cbind(Label = train$isHelpful, convert(train.tokens.dfm, to='data.frame'))

names(train.tokens.df) <- make.names(names(train.tokens.df))


# Truncated SVD (latent semantic analysis): reduce the tf-idf term space to
# 300 latent components via irlba.
start.time <- Sys.time()

train.lsa <- irlba(t(train.tfidf), nv=300, maxit=600)
# Document coordinates in LSA space (the V matrix), with the target label
# prepended -- this is the training set for the SVM models below.
train.svd <- data.frame(Label = train$isHelpful, train.lsa$v)

total.time <- Sys.time() - start.time
total.time
## Time difference of 48.35673 secs
# save for test dataset 
# Sigma^-1 and t(U) are kept so test documents can be folded into the same
# latent space later: d_hat = Sigma^-1 %*% t(U) %*% d
sigma.inverse <- 1/train.lsa$d
u.transpose <- t(train.lsa$u)


dim(u.transpose)
## [1]  300 1761
# Sanity check: no incomplete rows in the projected training data.
which(!complete.cases(train.svd))
## integer(0)
## Adding engineered features to separate training set

# Helper: bind the hand-engineered review features from `meta` onto `df`.
# Column names and order match the original inline mutate() calls exactly.
append_engineered <- function(df, meta) {
  df %>%
    mutate(
      month = meta$time_month,
      word_count = meta$word_count,
      exclaim_count = meta$exclaim_count,
      question_count = meta$question_count,
      char_count = meta$char_count,
      char_per_word = meta$char_per_word,
      rating = meta$rating
    )
}

# Engineered variants of both training representations.
train.tfidf.df.eng <- append_engineered(train.tfidf.df, train.pre)
train.svd.eng <- append_engineered(train.svd, train.pre)
# 
# 
# # apply quanteda's sentiment dictionary to dataset
# # create tokens of reviews
# train.tokens = tokens(train$reviewText, remove_punct = TRUE)
# 
# # apply sentiment dictionary provided by quanteda package
# train.tokens.lsd <- tokens_lookup(train.tokens, dictionary =  data_dictionary_LSD2015)
# 
# # create weighted dfm to account for different number of reviews per reviewer
# train.dfm.lsd <- dfm(train.tokens.lsd)
# train.dfm.lsd.weighted <- dfm_weight(train.dfm.lsd, scheme='prop')
# 
# # convert to joinable datafram
# train.dfm.lsd.weighted.df <- setDT(as.data.frame(train.dfm.lsd.weighted), keep.rownames='docs')
# train.dfm.lsd.weighted.df <- train.dfm.lsd.weighted.df %>% 
#   mutate(
#     docs = 
#   )
# 
# 
# # join sentiment count and review dataset
# train.sentiment <- train 
#   mutate(
#     docs = paste0("text", review_id)
#   )  %>%
#    inner_join(dfm_lsd_weighted_df, by=c('docs'='document'), copy=TRUE)
# 
#   

Train base models

Naive Bayes model

set.seed(5678)

# Multinomial Naive Bayes baseline, trained on the raw count dfm (not the
# tf-idf weighting) with uniform class priors (quanteda default).
model.nb <- textmodel_nb(train.tokens.dfm, train$isHelpful)
summary(model.nb)
## 
## Call:
## textmodel_nb.dfm(x = train.tokens.dfm, y = train$isHelpful)
## 
## Class Priors:
## (showing first 2 elements)
##   0   1 
## 0.5 0.5 
## 
## Estimated Feature Scores:
##       good    inform   project    includ      book     level    great  interest
## 0 0.007262 0.0001395 0.0013111 0.0016179 0.0001953 0.0013575 0.006825 5.579e-05
## 1 0.004395 0.0003003 0.0009009 0.0009282 0.0001638 0.0009009 0.003030 2.730e-04
##        feet       wet    pocket     joint   bought      kreg    master
## 0 0.0005207 0.0003161 0.0009577 0.0002139 0.003450 0.0002696 0.0001674
## 1 0.0004095 0.0002457 0.0008463 0.0005460 0.002239 0.0003822 0.0002730
##         kit     built   bookcas    beauti     hitch    variat      trim
## 0 0.0011065 0.0010228 2.789e-05 0.0003068 5.579e-05 4.649e-05 0.0003161
## 1 0.0009828 0.0008736 5.460e-05 0.0002730 1.092e-04 1.092e-04 0.0004914
##        plan     move     chest    drawer     doubt     basic    caveat
## 0 0.0004556 0.001153 0.0001023 0.0001674 0.0001860 0.0004463 0.0000186
## 1 0.0003549 0.001010 0.0001092 0.0003822 0.0001092 0.0004914 0.0001092
##       extra
## 0 0.0010507
## 1 0.0007917

SVM

set.seed(5678)

# 10-fold CV repeated twice; the same fold index is reused by the other
# models below so their resampling results are directly comparable.
cv.folds <- createMultiFolds(train.svd$Label, k=10, times=2)
cv.cntrl <- trainControl(method = 'repeatedcv', number=10, repeats=2, index=cv.folds)

start.time <- Sys.time()

# Parallel backend (3 SNOW workers) for caret's resampling loop.
cl <- makeCluster(3, type='SOCK')
registerDoSNOW(cl)

# Linear-kernel SVM on the 300 LSA components; predictors centered/scaled.
svm.linear <- train(Label ~ ., 
                    data=train.svd, 
                    method='svmLinear', 
                    preProcess = c("center", "scale"),
                    trControl = cv.cntrl, tuneLength=7,
                    na.action=na.exclude
                    )

# BUG FIX: on.exit() only registers a handler inside a function call; at the
# top level of a script it is a no-op, so the worker cluster was never shut
# down and leaked. Stop it explicitly instead.
stopCluster(cl)
   
total.time <- Sys.time() - start.time
total.time
## Time difference of 2.165601 mins
svm.linear
## Support Vector Machines with Linear Kernel 
## 
## 2802 samples
##  300 predictor
##    2 classes: '0', '1' 
## 
## Pre-processing: centered (300), scaled (300) 
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 2521, 2522, 2522, 2522, 2521, 2523, ... 
## Resampling results:
## 
##   Accuracy   Kappa   
##   0.8825887  0.206604
## 
## Tuning parameter 'C' was held constant at a value of 1

Single Decision Tree (CART)

set.seed(5678)

start.time <- Sys.time()

# Parallel backend for caret's resampling loop.
cl <- makeCluster(3, type='SOCK')
registerDoSNOW(cl)

# Single CART decision tree on the full tf-idf term matrix (not the LSA
# components), tuning cp over 7 values with the shared CV control.
rpart.cv <- train(Label ~ ., 
                    data=train.tfidf.df, 
                    method='rpart', 
                    trControl = cv.cntrl, tuneLength=7
                    )

stopCluster(cl)

total.time <- Sys.time() - start.time
total.time
## Time difference of 3.093847 mins
rpart.cv
## CART 
## 
## 2802 samples
## 1762 predictors
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 2521, 2522, 2522, 2522, 2521, 2523, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa     
##   0.001766784  0.8834758  0.10195526
##   0.002355713  0.8845485  0.10061579
##   0.003533569  0.8856200  0.08966964
##   0.007067138  0.8906168  0.07029307
##   0.008833922  0.8931155  0.06041990
##   0.014134276  0.8952558  0.03991439
##   0.015901060  0.8965027  0.01762717
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01590106.

Random Forest

###
# Including code here, but not evaluating due to hardware performance
####

#set.seed(5678)
# 
# start.time <- Sys.time()
# 
# cl <- makeCluster(3, type='SOCK')
# registerDoSNOW(cl)
# 
# randforr.cv <- train(Label ~ ., 
#                     data=train.svd, 
#                     method='rf', 
#                     trControl = cv.cntrl, tuneLength=7
#                     )
# 
# stopCluster(cl)
# 
# total.time <- Sys.time() - start.time
# total.time
# 
# randforr.cv
# 
# varImpPlot(randforr.cv$finalModel)

Train new models with engineered features

SVM

set.seed(5678)

# Re-create the same CV folds (same seed) so the engineered-feature run is
# comparable to the base SVM run above.
cv.folds <- createMultiFolds(train.svd$Label, k=10, times=2)
cv.cntrl <- trainControl(method = 'repeatedcv', number=10, repeats=2, index=cv.folds)

start.time <- Sys.time()

cl <- makeCluster(3, type='SOCK')
registerDoSNOW(cl)

# Linear SVM on the LSA components plus the hand-engineered review features.
svm.linear.eng <- train(Label ~ ., 
                    data=train.svd.eng, 
                    method='svmLinear', 
                    preProcess = c("center", "scale"),
                    trControl = cv.cntrl, tuneLength=7,
                    na.action=na.exclude
                    )

# BUG FIX: on.exit() is a no-op at the top level of a script (it only works
# inside a function), so the cluster was never stopped. Stop it explicitly.
stopCluster(cl)
   
total.time <- Sys.time() - start.time
total.time
## Time difference of 2.262214 mins
svm.linear.eng
## Support Vector Machines with Linear Kernel 
## 
## 2802 samples
##  307 predictor
##    2 classes: '0', '1' 
## 
## Pre-processing: centered (320), scaled (320) 
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 2521, 2522, 2522, 2522, 2521, 2523, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8790153  0.2070234
## 
## Tuning parameter 'C' was held constant at a value of 1

Single Decision Tree (CART) with Engineered Features

set.seed(5678)

start.time <- Sys.time()

# Parallel backend for caret's resampling loop.
cl <- makeCluster(3, type='SOCK')
registerDoSNOW(cl)

# CART tree on tf-idf terms plus the hand-engineered review features.
rpart.cv.eng <- train(Label ~ ., 
                    data=train.tfidf.df.eng, 
                    method='rpart', 
                    trControl = cv.cntrl, tuneLength=7
                    )

stopCluster(cl)

total.time <- Sys.time() - start.time
total.time
## Time difference of 2.952629 mins
rpart.cv.eng
## CART 
## 
## 2802 samples
## 1768 predictors
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 2521, 2522, 2522, 2522, 2521, 2523, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa     
##   0.003533569  0.8857973  0.16706031
##   0.007067138  0.8895434  0.15308092
##   0.007950530  0.8929337  0.14758278
##   0.008833922  0.8936500  0.14619550
##   0.010600707  0.8956111  0.14497188
##   0.012367491  0.8970384  0.12719004
##   0.013250883  0.8965027  0.09889021
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01236749.

Prep Test Set

# Engineered features for the test set -- must mirror the train.pre pipeline
# exactly so the models see the same columns.
test.pre <- test %>% 
  mutate(
    rating = factor(overall),  # star rating as a factor
    utc_time = lubridate::as_datetime(unixReviewTime),
    time_month = factor(lubridate::month(utc_time), ordered = TRUE, levels = c('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12')),
    word_count = str_count(reviewText, pattern = "([A-Z]|[a-z])\\w+"),
    exclaim_count = str_count(reviewText, pattern = "!"),
    question_count = str_count(reviewText, pattern = "\\?"),
    char_count = str_length(reviewText),
    # guard against 0/0 -> NaN for empty reviews
    char_per_word = ifelse( is.na((char_count)/(word_count)), 0,(char_count)/(word_count))
  ) %>% 
  select(-c(utc_time, unixReviewTime))


# Manually changing punctuation: map each punctuation character to a named
# placeholder token so it would survive tokenization. A single named vector
# replaces the long chain of str_replace_all() calls; stringr applies the
# patterns one after another, in order, exactly as the chained version did.
# NOTE(review): as on the training side, test.punct is never used downstream
# (the tokenizer reads test$reviewText) -- confirm that is intentional.
punct.map <- c(
  '\\.' = ' punct.period ',
  '\\,' = ' punct.comma ',
  '\\;' = ' punct.semicolon ',
  '\\:' = ' punct.colon ',
  '\\[' = ' punct.bracket ',
  '\\]' = ' punct.bracket ',
  '\\(' = ' punct.paren ',
  '\\)' = ' punct.paren ',
  '\\-' = ' punct.dash ',
  '\\—' = ' punct.dash ',
  '\\+' = ' punct.plus ',
  '\\=' = ' punct.equal ',
  '\\!' = ' punct.bang ',
  '\\@' = ' punct.at ',
  '\\#' = ' punct.hash ',
  '\\?' = ' punct.question ',
  '\\$' = ' punct.dollar ',
  '\\%' = ' punct.percent ',
  '\\&' = ' punct.amp ',
  '\\*' = ' punct.aster ',
  "\\'" = ' punct.quot ',
  '\\"' = ' punct.quot ',
  '\\`' = ' punct.quot '
)

test.punct <- test %>% 
  mutate(reviewText = str_replace_all(reviewText, punct.map))




# create new token set for model evaluation and dfm -- mirrors the training
# tokenization exactly so features line up
test.tokens <- tokens(test$reviewText, 
                       what = "word",
                       remove_numbers = TRUE,
                       remove_symbols = TRUE, 
                       remove_punct = TRUE,
                       split_hyphens = TRUE,
                       remove_separators = TRUE
                       )

test.tokens.dfm <- test.tokens %>% 
  tokens_tolower() %>% 
  tokens_remove(stopwords(source = 'smart')) %>% 
  tokens_wordstem(language = 'english') %>% 
  dfm() 


# Align the test tf-idf matrix to the training feature space.
# FIX: use dfm_match() (as already done for the NB predictions below) rather
# than dfm_select(pattern = <dfm>): dfm_match guarantees the result has
# exactly the training features, in training order, padding unseen features
# with zeros -- which the SVD fold-in projection below requires. Recent
# quanteda versions deprecate selecting on a dfm pattern for this purpose.
# NOTE(review): dfm_trim here filters by *test-set* frequencies; training
# features trimmed away are re-added as zeros by dfm_match, so consider
# dropping the trim step entirely -- verify intent.
test.tfidf <-  test.tokens.dfm %>%
  dfm_trim(min_termfreq = 10, min_docfreq = 2) %>%
  dfm_tfidf() %>% 
  dfm_match(features = featnames(train.tfidf))

test.tfidf.df <- data.frame(test.tfidf)
names(test.tfidf.df) <- make.names(names(test.tfidf.df))



# Dense copies of the raw (count-weighted) test dfm.
test.tokens.matrix <- as.matrix(test.tokens.dfm)
test.tokens.df <- convert(test.tokens.dfm, to='data.frame')
names(test.tokens.df) <- make.names(names(test.tokens.df))


# Fold the test documents into the training LSA space:
# d_hat = Sigma^-1 %*% t(U) %*% d. Note %*% binds tighter than *, so
# sigma.inverse (a vector) scales the rows of (t(U) %*% t(test.tfidf))
# elementwise -- equivalent to left-multiplying by diag(1/d).
test.svd.raw <- t(sigma.inverse * u.transpose %*% t(test.tfidf))
test.svd <- data.frame(as.matrix(test.svd.raw))

# Append the hand-engineered review features (same columns, same order as
# the training side) to the LSA representation.
test.svd.eng <- test.svd %>% 
  mutate(
    month = test.pre$time_month,
    word_count = test.pre$word_count,
    exclaim_count = test.pre$exclaim_count,
    question_count = test.pre$question_count,
    char_count = test.pre$char_count,
    char_per_word = test.pre$char_per_word,
    rating = test.pre$rating
  )

## Adding engineered features to separate training set

# Same engineered features appended to the tf-idf representation.
test.tfidf.df.eng <- test.tfidf.df %>% 
  mutate(
    month = test.pre$time_month,
    word_count = test.pre$word_count,
    exclaim_count = test.pre$exclaim_count,
    question_count = test.pre$question_count,
    char_count = test.pre$char_count,
    char_per_word = test.pre$char_per_word,
    rating = test.pre$rating
  )

Evaluate

Confusion Matrix for train set

# Naive Bayes -- in-sample predictions on the training dfm.
# NOTE(review): caret::confusionMatrix(data, reference) expects predictions
# first and the truth second; here the arguments are reversed, so the printed
# Sensitivity/Specificity (and Pos/Neg Pred Value) are transposed relative to
# their usual meaning. Accuracy and Kappa are unaffected. The same reversed
# pattern repeats in the other confusionMatrix() calls in this section.
nb.predicted <- predict(model.nb, train.tokens.dfm)
confusionMatrix(train$isHelpful, nb.predicted)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2413  106
##          1   74  209
##                                          
##                Accuracy : 0.9358         
##                  95% CI : (0.926, 0.9446)
##     No Information Rate : 0.8876         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.6632         
##                                          
##  Mcnemar's Test P-Value : 0.02085        
##                                          
##             Sensitivity : 0.9702         
##             Specificity : 0.6635         
##          Pos Pred Value : 0.9579         
##          Neg Pred Value : 0.7385         
##              Prevalence : 0.8876         
##          Detection Rate : 0.8612         
##    Detection Prevalence : 0.8990         
##       Balanced Accuracy : 0.8169         
##                                          
##        'Positive' Class : 0              
## 
# Support Vector Machine -- in-sample predictions on the LSA components.
# NOTE(review): arguments to confusionMatrix are (truth, predictions); caret
# expects (predictions, truth) -- sensitivity/specificity are transposed.
svm.predicted <- predict(svm.linear, train.svd)
confusionMatrix(train.svd$Label, svm.predicted)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2512    7
##          1  160  123
##                                          
##                Accuracy : 0.9404         
##                  95% CI : (0.931, 0.9489)
##     No Information Rate : 0.9536         
##     P-Value [Acc > NIR] : 0.9994         
##                                          
##                   Kappa : 0.5682         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9401         
##             Specificity : 0.9462         
##          Pos Pred Value : 0.9972         
##          Neg Pred Value : 0.4346         
##              Prevalence : 0.9536         
##          Detection Rate : 0.8965         
##    Detection Prevalence : 0.8990         
##       Balanced Accuracy : 0.9431         
##                                          
##        'Positive' Class : 0              
## 
# Single Tree -- in-sample predictions on the tf-idf data frame.
# NOTE(review): `train.labels` is not defined anywhere in this section; the
# other train-set evaluations use train$isHelpful / train.tfidf.df$Label.
# Presumably train.labels holds the same vector -- verify it exists and
# matches, or use train.tfidf.df$Label for consistency.
rpart.predicted <- predict(rpart.cv, train.tfidf.df)
confusionMatrix(train.labels, rpart.predicted)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2519    0
##          1  283    0
##                                           
##                Accuracy : 0.899           
##                  95% CI : (0.8872, 0.9099)
##     No Information Rate : 1               
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.899           
##             Specificity :    NA           
##          Pos Pred Value :    NA           
##          Neg Pred Value :    NA           
##              Prevalence : 1.000           
##          Detection Rate : 0.899           
##    Detection Prevalence : 0.899           
##       Balanced Accuracy :    NA           
##                                           
##        'Positive' Class : 0               
## 
# Support Vector Machine - engineered features
# BUG FIX: this previously called predict(svm.linear, ...), i.e. the BASE
# model, which is why the printed confusion matrix below is byte-identical to
# the base SVM matrix above. Predict with the model actually trained on the
# engineered features.
svm.predicted.eng <- predict(svm.linear.eng, train.svd.eng)
confusionMatrix(train.svd.eng$Label, svm.predicted.eng)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2512    7
##          1  160  123
##                                          
##                Accuracy : 0.9404         
##                  95% CI : (0.931, 0.9489)
##     No Information Rate : 0.9536         
##     P-Value [Acc > NIR] : 0.9994         
##                                          
##                   Kappa : 0.5682         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9401         
##             Specificity : 0.9462         
##          Pos Pred Value : 0.9972         
##          Neg Pred Value : 0.4346         
##              Prevalence : 0.9536         
##          Detection Rate : 0.8965         
##    Detection Prevalence : 0.8990         
##       Balanced Accuracy : 0.9431         
##                                          
##        'Positive' Class : 0              
## 
# Single Tree -- in-sample predictions with the engineered-feature tree.
# (Label column is identical in train.tfidf.df and train.tfidf.df.eng.)
rpart.predicted.eng <- predict(rpart.cv.eng, train.tfidf.df.eng)
confusionMatrix(train.tfidf.df$Label, rpart.predicted.eng)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2509   10
##          1  240   43
##                                           
##                Accuracy : 0.9108          
##                  95% CI : (0.8996, 0.9211)
##     No Information Rate : 0.9811          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2315          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9127          
##             Specificity : 0.8113          
##          Pos Pred Value : 0.9960          
##          Neg Pred Value : 0.1519          
##              Prevalence : 0.9811          
##          Detection Rate : 0.8954          
##    Detection Prevalence : 0.8990          
##       Balanced Accuracy : 0.8620          
##                                           
##        'Positive' Class : 0               
## 
# Random Forest
# confusionMatrix(train.svd$Label, randforr.cv$finalModel$predicted)

Confusion Matrix for Test Set

Specificity is the most important metric to me - I want to be able to accurately predict whether a review is going to be helpful, so I can alter its appearance in the UI. A single decision tree had the highest specificity at 53.9% and overall accuracy of 60.3%. As a PM, this isn’t high enough for me to hide all reviews entirely, but maybe highlight a few which we have high confidence in.

# Naive Bayes -- dfm_match() aligns the test features to the training
# feature set (required by textmodel_nb's predict method).
nb.predicted.test <- predict(model.nb, newdata = dfm_match(test.tokens.dfm, features = featnames(train.tokens.dfm)))
confusionMatrix(test.labels, nb.predicted.test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1009   70
##          1  106   14
##                                           
##                Accuracy : 0.8532          
##                  95% CI : (0.8319, 0.8728)
##     No Information Rate : 0.9299          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : 0.0598          
##                                           
##  Mcnemar's Test P-Value : 0.008334        
##                                           
##             Sensitivity : 0.9049          
##             Specificity : 0.1667          
##          Pos Pred Value : 0.9351          
##          Neg Pred Value : 0.1167          
##              Prevalence : 0.9299          
##          Detection Rate : 0.8415          
##    Detection Prevalence : 0.8999          
##       Balanced Accuracy : 0.5358          
##                                           
##        'Positive' Class : 0               
## 
# Support Vector Machine -- test documents projected into the training LSA
# space (test.svd) are scored by the base linear SVM.
svm.predicted.test <- predict(svm.linear, test.svd)
confusionMatrix(test.labels, svm.predicted.test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1064   15
##          1  102   18
##                                           
##                Accuracy : 0.9024          
##                  95% CI : (0.8842, 0.9186)
##     No Information Rate : 0.9725          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2008          
##                                           
##  Mcnemar's Test P-Value : 1.855e-15       
##                                           
##             Sensitivity : 0.9125          
##             Specificity : 0.5455          
##          Pos Pred Value : 0.9861          
##          Neg Pred Value : 0.1500          
##              Prevalence : 0.9725          
##          Detection Rate : 0.8874          
##    Detection Prevalence : 0.8999          
##       Balanced Accuracy : 0.7290          
##                                           
##        'Positive' Class : 0               
## 
# Single Tree -- base CART model on the aligned test tf-idf frame.
rpart.predicted.test <- predict(rpart.cv, test.tfidf.df)
confusionMatrix(test.labels, rpart.predicted.test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1079    0
##          1  120    0
##                                           
##                Accuracy : 0.8999          
##                  95% CI : (0.8815, 0.9163)
##     No Information Rate : 1               
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8999          
##             Specificity :     NA          
##          Pos Pred Value :     NA          
##          Neg Pred Value :     NA          
##              Prevalence : 1.0000          
##          Detection Rate : 0.8999          
##    Detection Prevalence : 0.8999          
##       Balanced Accuracy :     NA          
##                                           
##        'Positive' Class : 0               
## 
# # Random Forest
# randforr.predicted.test <- predict(randforr.cv, test.svd)
# confusionMatrix(test.labels, randforr.predicted.test)

# Support Vector Machine - Engineered Features
svm.predicted.test.eng <- predict(svm.linear.eng, test.svd.eng)
# BUG FIX: the engineered predictions were computed above but the base-model
# predictions (svm.predicted.test) were passed to confusionMatrix, so the
# printed matrix below merely duplicates the base SVM test results. Evaluate
# the engineered predictions instead.
confusionMatrix(test.labels, svm.predicted.test.eng)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1064   15
##          1  102   18
##                                           
##                Accuracy : 0.9024          
##                  95% CI : (0.8842, 0.9186)
##     No Information Rate : 0.9725          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2008          
##                                           
##  Mcnemar's Test P-Value : 1.855e-15       
##                                           
##             Sensitivity : 0.9125          
##             Specificity : 0.5455          
##          Pos Pred Value : 0.9861          
##          Neg Pred Value : 0.1500          
##              Prevalence : 0.9725          
##          Detection Rate : 0.8874          
##    Detection Prevalence : 0.8999          
##       Balanced Accuracy : 0.7290          
##                                           
##        'Positive' Class : 0               
## 
# Single Tree - Engineered Features -- CART model trained with engineered
# features, scored on the matching test frame.
rpart.predicted.test.eng <- predict(rpart.cv.eng, test.tfidf.df.eng)
confusionMatrix(test.labels, rpart.predicted.test.eng)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1074    5
##          1  116    4
##                                           
##                Accuracy : 0.8991          
##                  95% CI : (0.8806, 0.9156)
##     No Information Rate : 0.9925          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0487          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.90252         
##             Specificity : 0.44444         
##          Pos Pred Value : 0.99537         
##          Neg Pred Value : 0.03333         
##              Prevalence : 0.99249         
##          Detection Rate : 0.89575         
##    Detection Prevalence : 0.89992         
##       Balanced Accuracy : 0.67348         
##                                           
##        'Positive' Class : 0               
##